require(knitr)
## Loading required package: knitr
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(scales)
library(readr)
## 
## Attaching package: 'readr'
## 
## The following objects are masked from 'package:scales':
## 
##     col_factor, col_numeric
#opts_knit$set(root.dir="/run/user//1001//gvfs/smb-share:server=biocldap,share=scratch/merrimanlab/murray/working_dir/extract/TajD/")

POP=""
TD=data.frame()
s=data.frame()
make_graph=function(TD,s){
  ggplot(data = TD, aes(x=BIN_START, y=TajimaD), ylim=c(min(TD$TajimaD -0.5), max(TD$TajimaD + 0.5)))  + 
    geom_point(shape = 16, colour= alpha("black", 1/5)) + 
    #geom_hex() +
    facet_wrap(~CHROM, scales = "free_x")  + 
    geom_hline(aes(yintercept= q1, colour ="quantile"), data=s,)  + 
    geom_hline(aes(yintercept= q2, colour ="quantile"), data=s)  +
    geom_hline(aes(yintercept= mean, colour="mean"), data=s) + 
    scale_colour_manual("",breaks=c("mean","quantile"),values=c("green","red")) + 
    scale_x_continuous( xlab("Chromosome Position (Mbp)")) + 
    ylab("Tajima's D") + 
    ggtitle(paste0(POP," Tajima's D by Chromosome")) + 
    theme( plot.background= element_rect(colour="black",fill=NA), legend.position= c(0.75, 0.12)) + 
    theme_bw()
}
dir="/home/murraycadzow/smb_mounts/smb-share:server=biocldap,share=scratch/merrimanlab/murray/working_dir/extract/TajD/"

get_genes_in_region= function(regions, size){
  library(RMySQL)
  drv = dbDriver("MySQL")
  ensembl_core = dbConnect(drv, user="anonymous", host="ensembldb.ensembl.org", dbname="homo_sapiens_core_75_37", password="")
  for(i in 1:length(regions[,1])){
    chr= as.data.frame(regions)[i,1]
    pos1= as.data.frame(regions)[i,2]
    pos2 = pos1+size
    
    print(dbGetQuery(ensembl_core, paste0("select s.name, g.seq_region_start, g.seq_region_end, x.display_label, s.coord_system_id from  gene g, seq_region s, xref x where s.name =", chr, " AND (", pos1 ," > g.seq_region_start AND ",pos2,"  < g.seq_region_end OR ",pos1," BETWEEN g.seq_region_start AND g.seq_region_end OR ",pos2," BETWEEN g.seq_region_start AND g.seq_region_end)  AND g.display_xref_id = x.xref_id group by x.display_label AND s.seq_region_id = g.seq_region_id order by s.name *1, g.seq_region_start")))
  }
}

for( POP in c("AXIOM","OMNI","CEU","CHB","CHS","GBR","YRI")){
  print(POP)
  
  TD=data.frame()
  for( i in 1:22){
    TD=rbind(TD,read.table(file = paste0(dir,POP,i,".taj_d"), header=TRUE))
  }
  s = TD %>% group_by(CHROM) %>% summarise(mean=mean(TajimaD), sd=sd(TajimaD), min=min(TajimaD), max=max(TajimaD), q1 = quantile(TajimaD, 0.01), q2 = quantile(TajimaD, 0.99))
  print(as.data.frame(round(s, digits=2)))
  plot(make_graph(TD,s))
  top = TD %>% arrange(TajimaD) %>% head(n=10)
    #filter(min(TajimaD) == TajimaD | max(TajimaD) == TajimaD) 
  print(as.data.frame(top))
  get_genes_in_region(as.data.frame(top), size=30000)
}
## [1] "AXIOM"
##    CHROM mean   sd   min  max    q1   q2
## 1      1 1.06 1.22 -2.41 5.26 -1.73 3.86
## 2      2 1.14 1.30 -2.58 5.17 -1.84 3.92
## 3      3 1.05 1.25 -2.44 5.01 -1.80 3.81
## 4      4 1.20 1.28 -2.41 5.13 -1.81 4.03
## 5      5 1.22 1.23 -2.52 4.91 -1.64 3.90
## 6      6 1.12 1.25 -2.35 5.01 -1.73 3.89
## 7      7 1.15 1.29 -2.38 5.10 -1.85 3.87
## 8      8 1.24 1.29 -2.47 4.97 -1.75 4.08
## 9      9 1.02 1.22 -2.39 5.04 -1.66 3.93
## 10    10 1.13 1.24 -2.51 4.81 -1.76 3.88
## 11    11 1.20 1.29 -2.45 4.79 -1.81 3.87
## 12    12 1.10 1.23 -2.58 5.11 -1.88 3.79
## 13    13 1.27 1.24 -2.37 4.69 -1.63 3.97
## 14    14 1.22 1.24 -2.48 4.87 -1.79 3.99
## 15    15 1.17 1.14 -2.18 5.08 -1.56 3.56
## 16    16 0.94 1.21 -2.24 4.71 -1.75 3.65
## 17    17 1.06 1.24 -2.40 4.54 -1.88 3.86
## 18    18 1.14 1.24 -2.51 4.73 -1.80 3.79
## 19    19 1.04 1.15 -2.43 4.48 -1.53 3.58
## 20    20 1.13 1.20 -2.41 4.83 -1.66 3.91
## 21    21 1.22 1.19 -1.97 4.60 -1.69 3.85
## 22    22 1.02 1.14 -2.27 4.50 -1.48 3.71
##    CHROM BIN_START N_SNPS  TajimaD
## 1     12  79140000     71 -2.57685
## 2      2 237420000     62 -2.57669
## 3      2 122340000     59 -2.56972
## 4      5  97110000    196 -2.52491
## 5     18  30660000     48 -2.51456
## 6     10  22560000     66 -2.50805
## 7      2  13560000     68 -2.50327
## 8      2 122160000     38 -2.49352
## 9      2  22230000     39 -2.48278
## 10    14  68490000     35 -2.47674
## Loading required package: DBI

##   name seq_region_start seq_region_end display_label coord_system_id
## 1   12         78493824       79191463    RNF219-AS1               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    2        237205505      237997288          RYR2               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    2        122216466      122349367      PPAPDC1A               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    5         97094758       97123230 RP11-307E17.8               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   18         30565801       30660526     LINC00189               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   10         22513451       22567646 RP11-958F21.1               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    2         13217497       13652754       LDLRAD4               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    2        122026130      122293579  RP11-820L6.1               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    2         22208146       22242162  RP11-449D8.1               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   14         68297986       68668670     GNG12-AS1               2
## [1] "OMNI"
##    CHROM mean   sd   min  max    q1   q2
## 1      1 1.18 1.31 -2.58 5.38 -1.79 4.15
## 2      2 1.28 1.40 -2.57 5.05 -1.97 4.14
## 3      3 1.16 1.36 -2.49 5.43 -1.95 4.02
## 4      4 1.31 1.37 -2.47 5.14 -2.02 4.20
## 5      5 1.38 1.33 -2.46 5.78 -1.82 4.14
## 6      6 1.37 1.34 -2.55 5.52 -1.92 4.12
## 7      7 1.38 1.35 -2.65 5.37 -1.94 4.15
## 8      8 1.41 1.35 -2.52 5.14 -1.91 4.12
## 9      9 1.15 1.30 -2.30 5.33 -1.69 4.22
## 10    10 1.29 1.34 -2.45 5.10 -1.80 4.18
## 11    11 1.33 1.32 -2.49 5.63 -1.94 4.05
## 12    12 1.25 1.31 -2.49 5.33 -1.90 4.05
## 13    13 1.35 1.26 -2.43 5.18 -1.56 4.04
## 14    14 1.35 1.30 -2.46 5.13 -1.82 3.99
## 15    15 1.31 1.27 -2.36 4.99 -1.71 4.01
## 16    16 1.04 1.33 -2.62 4.95 -1.88 3.88
## 17    17 1.18 1.29 -2.61 4.96 -1.93 3.93
## 18    18 1.35 1.30 -2.37 5.10 -1.81 4.07
## 19    19 1.33 1.25 -2.21 4.88 -1.55 3.92
## 20    20 1.31 1.28 -2.37 5.10 -1.72 4.01
## 21    21 1.33 1.33 -2.53 5.51 -1.68 4.48
## 22    22 1.19 1.19 -2.46 4.67 -1.68 3.70

##    CHROM BIN_START N_SNPS  TajimaD
## 1      7 137220000    115 -2.65073
## 2     16   2700000     99 -2.61632
## 3     17  58530000    105 -2.61107
## 4      1 247980000    121 -2.57726
## 5      2  68790000     67 -2.57055
## 6      6  71850000    134 -2.55347
## 7      1 247950000     84 -2.54450
## 8      6  69870000     56 -2.53133
## 9     21  23370000    129 -2.52540
## 10     8 127110000    106 -2.52308
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    7        137150022      137225025 RP11-381K20.2               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   16          2673524        2740753          EBF4               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   17         58297400       58569959 RP11-325K19.1               2
## [1] name             seq_region_start seq_region_end   display_label   
## [5] coord_system_id 
## <0 rows> (or 0-length row.names)
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    2         68741234       68809906        PGM5P1               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    6         71878626       71907790  RP11-669I1.1               2
## [1] name             seq_region_start seq_region_end   display_label   
## [5] coord_system_id 
## <0 rows> (or 0-length row.names)
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    6         69891208       69901481    AC125634.1               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   21         23381263       23470778    AP000472.2               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    8        127019885      127115586          NEK6               2
## [1] "CEU"
##    CHROM mean   sd   min  max    q1   q2
## 1      1 0.86 1.10 -2.65 5.07 -1.74 3.41
## 2      2 0.93 1.10 -2.58 4.61 -1.66 3.33
## 3      3 1.00 1.11 -2.48 5.41 -1.69 3.46
## 4      4 1.03 1.14 -2.46 4.96 -1.70 3.46
## 5      5 1.00 1.08 -2.27 4.35 -1.53 3.38
## 6      6 1.03 1.08 -2.49 4.50 -1.61 3.50
## 7      7 1.01 1.10 -2.34 4.67 -1.70 3.35
## 8      8 0.93 1.14 -2.46 5.19 -1.82 3.44
## 9      9 0.80 1.04 -2.38 4.92 -1.49 3.19
## 10    10 0.98 1.10 -2.25 5.08 -1.64 3.49
## 11    11 1.05 1.08 -2.28 4.75 -1.63 3.47
## 12    12 0.95 1.11 -2.29 4.94 -1.68 3.51
## 13    13 1.02 1.07 -2.46 5.06 -1.50 3.44
## 14    14 0.94 1.10 -2.45 5.35 -1.69 3.38
## 15    15 0.89 1.09 -2.42 4.20 -1.82 3.24
## 16    16 0.76 1.11 -2.01 4.07 -1.66 3.16
## 17    17 1.01 1.08 -2.29 4.34 -1.67 3.36
## 18    18 0.97 1.08 -2.24 4.30 -1.67 3.33
## 19    19 0.94 1.13 -2.55 4.88 -1.89 3.40
## 20    20 0.96 1.06 -2.58 4.11 -1.49 3.47
## 21    21 1.11 1.09 -2.15 5.13 -1.34 3.53
## 22    22 0.98 1.03 -2.15 4.11 -1.27 3.29

##    CHROM BIN_START N_SNPS  TajimaD
## 1      1 188850000    196 -2.64798
## 2     20  41520000     80 -2.58306
## 3      2  21780000     91 -2.58216
## 4      2 182580000    188 -2.56844
## 5     19  32070000     94 -2.55411
## 6      6  73830000     95 -2.49485
## 7      3  26280000    101 -2.47956
## 8      8  35610000     88 -2.46394
## 9     13 104850000     82 -2.45996
## 10     4  13320000    114 -2.45799
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    1        188577543      189152418     LINC01090               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   20         41516046       41523018       FAM74A6               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    2         21665003       22214734        CASC15               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    2        182511288      182639423        ATP11B               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   19         32028898       32145082      NRG1-IT2               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    6         73859385       73862680 RP11-531A24.3               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    3         26212864       26430059    AP000235.2               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    8         35445892       35732332    AP000320.7               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   13        104864962      104893895         CASP5               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    4         13217497       13652754       LDLRAD4               2
## [1] "CHB"
##    CHROM mean   sd   min  max    q1   q2
## 1      1 1.41 1.30 -2.25 5.41 -1.46 4.19
## 2      2 1.51 1.33 -2.45 5.28 -1.68 4.24
## 3      3 1.49 1.30 -2.30 5.62 -1.58 4.16
## 4      4 1.67 1.29 -2.47 5.50 -1.54 4.40
## 5      5 1.61 1.28 -2.36 5.44 -1.45 4.30
## 6      6 1.61 1.22 -2.36 5.29 -1.34 4.29
## 7      7 1.63 1.27 -2.30 5.13 -1.50 4.22
## 8      8 1.68 1.28 -2.40 5.14 -1.57 4.29
## 9      9 1.31 1.25 -2.23 5.02 -1.32 3.96
## 10    10 1.57 1.25 -2.35 5.65 -1.50 4.20
## 11    11 1.72 1.27 -2.24 5.09 -1.45 4.27
## 12    12 1.57 1.28 -2.21 5.57 -1.57 4.20
## 13    13 1.70 1.22 -2.08 5.20 -1.34 4.29
## 14    14 1.67 1.21 -2.39 5.14 -1.31 4.30
## 15    15 1.60 1.22 -2.00 5.41 -1.38 4.16
## 16    16 1.31 1.35 -2.35 4.83 -1.64 4.06
## 17    17 1.56 1.27 -2.23 5.00 -1.56 4.20
## 18    18 1.65 1.25 -2.08 5.30 -1.21 4.13
## 19    19 1.59 1.29 -2.37 5.17 -1.52 4.20
## 20    20 1.61 1.28 -2.09 5.48 -1.51 4.15
## 21    21 1.73 1.22 -1.92 4.97 -1.01 4.37
## 22    22 1.51 1.24 -2.14 4.55 -1.40 3.99

##    CHROM BIN_START N_SNPS  TajimaD
## 1      4 179820000    209 -2.47443
## 2      2 195030000    148 -2.44921
## 3      8  93030000     53 -2.39828
## 4     14 105780000     89 -2.38921
## 5     19  32070000     76 -2.37235
## 6      2 108960000     76 -2.36440
## 7      6 105870000     40 -2.35785
## 8      5 117510000     81 -2.35769
## 9     16  47820000     95 -2.35057
## 10    10 107220000     62 -2.34669
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    4        179694484      179914813       CCDC141               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    2        194995465      195163807         ACAP2               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    8         93014884       93044347      C15orf32               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   14        105575031      105887950 RP11-556I14.2               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   19         32028898       32145082      NRG1-IT2               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    2        108926358      108963917    SLC25A24P2               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    6        105575031      105887950 RP11-556I14.2               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    5        116853124      117708503        ATRNL1               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   16         47311134       48143999         MDGA2               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   10        106962756      107242652          TBCK               2
## [1] "CHS"
##    CHROM mean   sd   min  max    q1   q2
## 1      1 1.41 1.30 -2.25 5.44 -1.46 4.22
## 2      2 1.52 1.33 -2.41 5.17 -1.62 4.27
## 3      3 1.49 1.30 -2.41 5.95 -1.62 4.22
## 4      4 1.69 1.31 -2.14 5.67 -1.55 4.46
## 5      5 1.59 1.30 -2.30 5.39 -1.45 4.35
## 6      6 1.63 1.26 -2.32 5.49 -1.37 4.31
## 7      7 1.64 1.27 -2.19 5.38 -1.52 4.26
## 8      8 1.68 1.29 -2.30 5.42 -1.54 4.32
## 9      9 1.33 1.26 -2.44 4.72 -1.24 4.02
## 10    10 1.61 1.25 -1.99 5.67 -1.46 4.24
## 11    11 1.73 1.28 -2.13 5.13 -1.53 4.30
## 12    12 1.57 1.30 -2.14 5.10 -1.55 4.25
## 13    13 1.73 1.23 -2.08 5.43 -1.18 4.40
## 14    14 1.72 1.19 -1.82 5.36 -1.14 4.28
## 15    15 1.61 1.22 -2.16 5.49 -1.28 4.12
## 16    16 1.35 1.35 -2.23 5.18 -1.46 4.04
## 17    17 1.55 1.29 -2.39 5.31 -1.49 4.24
## 18    18 1.69 1.26 -2.11 5.34 -1.38 4.24
## 19    19 1.61 1.29 -2.32 5.00 -1.43 4.31
## 20    20 1.63 1.24 -2.36 5.01 -1.27 4.20
## 21    21 1.77 1.22 -1.76 4.97 -0.94 4.48
## 22    22 1.56 1.20 -2.38 4.68 -1.30 4.01

##    CHROM BIN_START N_SNPS  TajimaD
## 1      9  92310000    121 -2.43656
## 2      2 195030000    141 -2.41266
## 3      3  17370000     88 -2.41213
## 4     17  58830000    185 -2.39427
## 5     17  62970000     97 -2.38774
## 6     22  46770000     65 -2.37656
## 7     20  24870000    117 -2.36448
## 8      6 105900000     51 -2.32052
## 9     19  32070000     51 -2.31985
## 10     6 105870000     37 -2.30978
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    9         92339633       92400146         CASC6               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    2        194995465      195163807         ACAP2               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    3         17354597       17428082        SLC7A2               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   17         58662895       59102516 RP5-1043L13.1               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   17         62931248       62997124      SLC22A25               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   22         46570039       46987717           DYM               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   20         24863206       24924358          UPB1               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    6        105902805      106087316 RP11-341A22.2               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   19         32028898       32145082      NRG1-IT2               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    6        105575031      105887950 RP11-556I14.2               2
## [1] "GBR"
##    CHROM mean   sd   min  max    q1   q2
## 1      1 0.89 1.10 -2.49 4.87 -1.70 3.37
## 2      2 0.95 1.11 -2.57 4.49 -1.66 3.36
## 3      3 1.02 1.11 -2.56 5.26 -1.68 3.49
## 4      4 1.07 1.16 -2.45 4.90 -1.65 3.58
## 5      5 1.03 1.07 -2.43 4.92 -1.47 3.34
## 6      6 1.07 1.09 -2.42 4.85 -1.61 3.53
## 7      7 1.05 1.11 -2.55 4.81 -1.74 3.52
## 8      8 0.96 1.14 -2.46 5.01 -1.87 3.42
## 9      9 0.82 1.04 -2.38 4.53 -1.51 3.23
## 10    10 1.01 1.07 -2.37 4.90 -1.45 3.47
## 11    11 1.08 1.06 -2.35 5.10 -1.70 3.41
## 12    12 0.98 1.14 -2.38 4.79 -1.68 3.59
## 13    13 1.10 1.07 -2.19 4.65 -1.43 3.45
## 14    14 0.95 1.11 -2.49 4.08 -1.74 3.34
## 15    15 0.99 1.09 -2.31 4.47 -1.70 3.38
## 16    16 0.77 1.14 -2.29 4.07 -1.79 3.21
## 17    17 1.05 1.10 -2.23 4.47 -1.70 3.39
## 18    18 1.02 1.08 -2.09 4.34 -1.58 3.38
## 19    19 0.97 1.12 -2.36 4.51 -1.90 3.40
## 20    20 0.97 1.05 -2.55 4.52 -1.64 3.33
## 21    21 1.11 1.10 -2.38 4.76 -1.48 3.58
## 22    22 1.06 1.03 -2.21 4.31 -1.13 3.29

##    CHROM BIN_START N_SNPS  TajimaD
## 1      2 182580000    186 -2.56737
## 2      3  26280000     99 -2.56121
## 3     20  41520000     76 -2.55033
## 4      7 137220000    169 -2.54973
## 5      2  21780000     88 -2.54636
## 6      1  35640000     43 -2.49439
## 7     14  98070000     85 -2.49139
## 8      8  23820000     62 -2.45889
## 9      4  91320000    191 -2.45368
## 10     1  28020000     50 -2.43462
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    2        182511288      182639423        ATP11B               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    3         26212864       26430059    AP000235.2               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   20         41516046       41523018       FAM74A6               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    7        137150022      137225025 RP11-381K20.2               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    2         21665003       22214734        CASC15               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    1         35445892       35732332    AP000320.7               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   14         98096514       98103932 RP11-461F11.3               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    8         23817659       23821323      TATDN2P3               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    4         91260558       91358859           BLM               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    1         28000021       28344504          OCA2               2
## [1] "YRI"
##    CHROM mean   sd   min  max    q1   q2
## 1      1 0.42 0.71 -1.80 4.06 -1.12 2.35
## 2      2 0.46 0.70 -1.91 3.92 -1.07 2.26
## 3      3 0.50 0.70 -1.75 4.32 -1.02 2.44
## 4      4 0.52 0.75 -1.90 3.88 -1.10 2.56
## 5      5 0.45 0.71 -1.90 4.03 -1.08 2.33
## 6      6 0.51 0.76 -1.70 4.12 -1.09 2.62
## 7      7 0.48 0.74 -1.62 4.34 -1.12 2.41
## 8      8 0.46 0.70 -1.81 3.41 -1.01 2.25
## 9      9 0.41 0.68 -1.77 3.42 -1.05 2.31
## 10    10 0.50 0.72 -1.87 3.64 -1.04 2.51
## 11    11 0.48 0.71 -1.51 3.65 -1.05 2.44
## 12    12 0.47 0.72 -1.81 3.91 -1.09 2.44
## 13    13 0.52 0.74 -1.83 4.12 -1.03 2.47
## 14    14 0.49 0.70 -1.41 4.30 -0.98 2.39
## 15    15 0.48 0.68 -1.89 3.76 -1.01 2.19
## 16    16 0.39 0.67 -1.70 3.17 -1.10 2.10
## 17    17 0.45 0.69 -1.79 3.02 -1.08 2.26
## 18    18 0.49 0.66 -1.71 3.01 -0.92 2.09
## 19    19 0.55 0.72 -1.85 3.75 -1.15 2.39
## 20    20 0.50 0.66 -1.61 3.12 -1.07 2.23
## 21    21 0.58 0.73 -1.46 4.87 -0.82 2.73
## 22    22 0.53 0.70 -1.31 3.37 -1.04 2.25

##    CHROM BIN_START N_SNPS  TajimaD
## 1      2  84390000    162 -1.90543
## 2      4  74910000    186 -1.90023
## 3      5  83820000    129 -1.89752
## 4     15  24480000    536 -1.89199
## 5     10 134430000    128 -1.87367
## 6     19  29160000    256 -1.85046
## 7     13  53010000    152 -1.82883
## 8     12  74250000    204 -1.81119
## 9      8  99570000     76 -1.80970
## 10     1 175230000    205 -1.79954
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    2         84304628       84391815 RP11-154D17.1               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    4         74920346       74958126  RP11-63P12.6               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    5         83847934       84108197 RP11-382A20.4               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   15         24437813       24503528    AP001255.2               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   10        134316643      134979309         EPHB1               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   19         29143289       29163375    AL935156.1               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   13         52949815       53137158    AC010967.2               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1   12         74233341       74280319  RP11-505P4.7               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    8         99497378       99646030 RP11-654A16.3               2
##   name seq_region_start seq_region_end display_label coord_system_id
## 1    1        174156363      175523428      NAALADL2               2

Clustering of populations

Distance matrix is created from taking 1000 most negative TD results and finding the number of windows that overlap between each population

library("RMySQL")

drv=dbDriver(drvName = "MySQL")
db=dbConnect(drv, host="127.0.0.1", user="murray", db="selection")
axiomTD= dbGetQuery(db, "select * from tajimasd where Population = 'AXIOM' and TajimasD < 0 order by TajimasD limit 1000;")
omniTD=dbGetQuery(db, "select * from tajimasd where Population = 'OMNI' and TajimasD < 0 order by TajimasD limit 1000;")
ceuTD=dbGetQuery(db, "select * from tajimasd where Population = 'CEU' and TajimasD < 0 order by TajimasD limit 1000;")
chbTD=dbGetQuery(db, "select * from tajimasd where Population = 'CHB' and TajimasD < 0 order by TajimasD limit 1000;")
chsTD=dbGetQuery(db, "select * from tajimasd where Population = 'CHS' and TajimasD < 0 order by TajimasD limit 1000;")
gbrTD=dbGetQuery(db, "select * from tajimasd where Population = 'GBR' and TajimasD < 0 order by TajimasD limit 1000;")
yriTD=dbGetQuery(db, "select * from tajimasd where Population = 'YRI' and TajimasD < 0 order by TajimasD limit 1000;")

overlap = function(pop1, pop2){
  return(length(merge(pop1, pop2, by = c("chrom", "chrom_start"))[,1]))
}

axiom_omniTD = overlap(axiomTD,omniTD)
axiom_ceuTD = overlap(axiomTD,ceuTD)
axiom_chbTD =overlap(axiomTD,chbTD)
axiom_chsTD = overlap(axiomTD,chsTD)
axiom_gbrTD = overlap(axiomTD,gbrTD) 
axiom_yriTD = overlap(axiomTD,yriTD)

omni_ceuTD = overlap(omniTD,ceuTD)
omni_chbTD = overlap(omniTD,chbTD)
omni_chsTD = overlap(omniTD,chsTD)
omni_gbrTD = overlap(omniTD,gbrTD)
omni_yriTD = overlap(omniTD,yriTD)

ceu_chbTD = overlap(ceuTD,chbTD)
ceu_chsTD = overlap(ceuTD,chsTD)
ceu_gbrTD = overlap(ceuTD,gbrTD)
ceu_yriTD = overlap(ceuTD,yriTD)

chb_chsTD = overlap(chbTD, chsTD)
chb_gbrTD = overlap(chbTD, gbrTD)
chb_yriTD = overlap(chbTD, yriTD)

chs_gbrTD = overlap(chsTD, gbrTD)
chs_yriTD = overlap(chsTD, yriTD)

gbr_yriTD = overlap(gbrTD,yriTD)

# make distance matrix
dist = matrix(nrow = 7, ncol=7)
colnames(dist) = c('axiom','omni','ceu','chb','chs','gbr','yri')
rownames(dist) = c('axiom','omni','ceu','chb','chs','gbr','yri')
dist[,1]= c(0,axiom_omniTD,axiom_ceuTD,axiom_chbTD,axiom_chsTD,axiom_gbrTD,axiom_yriTD)
dist[,2]= c(axiom_omniTD,0,omni_ceuTD,omni_chbTD,omni_chsTD,omni_gbrTD,omni_yriTD)
dist[,3]= c(axiom_ceuTD,omni_ceuTD,0,ceu_chbTD,ceu_chsTD,ceu_gbrTD,ceu_yriTD)
dist[,4]= c(axiom_ceuTD,omni_ceuTD,ceu_chbTD,0,chb_chsTD,chb_gbrTD,chb_yriTD)
dist[,5]= c(axiom_ceuTD,omni_ceuTD,ceu_chbTD,chb_chsTD,0,chs_gbrTD,chs_yriTD)
dist[,6]= c(axiom_ceuTD,omni_ceuTD,ceu_chbTD,chb_chsTD,chs_gbrTD,0,gbr_yriTD)
dist[1,]=t(dist[,1])
dist[2,]=t(dist[,2])
dist[3,]=t(dist[,3])
dist[4,]=t(dist[,4])
dist[5,]=t(dist[,5])
dist[6,]=t(dist[,6])
dist2 = 1/dist
dist2[1,1]=0;dist2[2,2]=0;dist2[3,3]=0;dist2[4,4]=0;dist2[5,5]=0;dist2[6,6]=0;dist2[7,7]=0
plot(hclust(dist(dist2)))

dist
##       axiom omni ceu chb chs gbr yri
## axiom     0  296 133 181 200 129  37
## omni    296    0 116 237 235 116  30
## ceu     133  116   0  99 109 577  52
## chb     181  237  99   0 518 105  31
## chs     200  235 109 518   0 115  39
## gbr     129  116 577 105 115   0  36
## yri      37   30  52  31  39  36  NA
#intersection of axiom and omni TAJIMAs D
library(VennDiagram)
## Loading required package: grid
grid.newpage()
draw.pairwise.venn(length(axiomTD[,1]),length(omniTD[,1]),length(merge(axiomTD, omniTD, by = c("chrom", "chrom_start"))[,1]), c("NZ Maori", "Samoan"), col = c("red","blue"), fill = c("red","blue"))

## (polygon[GRID.polygon.6966], polygon[GRID.polygon.6967], polygon[GRID.polygon.6968], polygon[GRID.polygon.6969], text[GRID.text.6970], text[GRID.text.6971], text[GRID.text.6972], text[GRID.text.6973], text[GRID.text.6974])